R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("tidyr")
library("readr")

## dat_crime <- read.csv(unz("https://github.com/avennu01/three-amigos/blob/master/train.csv.zip","train.csv")) .. unable to direclty read data from GitHub

dat_crime <- read.csv("https://www.dropbox.com/s/kjkt5ndf3jkibq4/train.csv?raw=1")

table(dat_crime$Category)
## 
##                       ARSON                     ASSAULT 
##                        1513                       76876 
##                  BAD CHECKS                     BRIBERY 
##                         406                         289 
##                    BURGLARY          DISORDERLY CONDUCT 
##                       36755                        4320 
## DRIVING UNDER THE INFLUENCE               DRUG/NARCOTIC 
##                        2268                       53971 
##                 DRUNKENNESS                EMBEZZLEMENT 
##                        4280                        1166 
##                   EXTORTION             FAMILY OFFENSES 
##                         256                         491 
##      FORGERY/COUNTERFEITING                       FRAUD 
##                       10609                       16679 
##                    GAMBLING                  KIDNAPPING 
##                         146                        2341 
##               LARCENY/THEFT                 LIQUOR LAWS 
##                      174900                        1903 
##                   LOITERING              MISSING PERSON 
##                        1225                       25989 
##                NON-CRIMINAL              OTHER OFFENSES 
##                       92304                      126182 
##     PORNOGRAPHY/OBSCENE MAT                PROSTITUTION 
##                          22                        7484 
##           RECOVERED VEHICLE                     ROBBERY 
##                        3138                       23000 
##                     RUNAWAY             SECONDARY CODES 
##                        1946                        9985 
##       SEX OFFENSES FORCIBLE   SEX OFFENSES NON FORCIBLE 
##                        4388                         148 
##             STOLEN PROPERTY                     SUICIDE 
##                        4540                         508 
##              SUSPICIOUS OCC                        TREA 
##                       31414                           6 
##                    TRESPASS                   VANDALISM 
##                        7326                       44725 
##               VEHICLE THEFT                    WARRANTS 
##                       53781                       42214 
##                 WEAPON LAWS 
##                        8555
# seperating Dates in to Date & Time and selecting below categories only, rest have been eliminated based on group consensus since they dint have impact on our end objective

# Missing Person .. Need more deepdive 

dat_crimeNew <- dat_crime %>% separate(col = Dates, into = c("Date","Time"), sep = " ",fill = "right" ) %>% filter(Category %in% c("ASSAULT","DISORDERLY CONDUCT","DRUNKENNESS","DRUG/NARCOTIC","KIDNAPPING","LARCENY/THEFT","LOITERING","MISSING PERSON","NON-CRIMINAL","ROBBERY","SECONDARY CODES","SEX OFFENSES FORCIBLE","SEX OFFENSES NON FORCIBLE","STOLEN PROPERTY","SUSPICIOUS OCC","VANDALISM","VEHICLE THEFT"))

dat_occ <- dat_crimeNew %>% group_by(Category,Descript) %>% summarize(occurence= n())

# Based of above table, below considerations have been made 
# Descripts to consider apart from one's selected .... ASSAULT, BATTERY,STALKING,LOITERING,
# complete categories to consider "KIDNAPPING","SEX OFFENSES FORCIBLE","SEX OFFENSES NON FORCIBLE"
# complete categories to ingore "VANDALISM","VEHICLE THEFT"

dat_crimeNew <- dat_crime %>% separate(col = Dates, into = c("Date","Time"), sep = " ",fill = "right" ) %>% filter(Category %in% c("ASSAULT","DISORDERLY CONDUCT","DRUNKENNESS","DRUG/NARCOTIC","KIDNAPPING","LARCENY/THEFT","LOITERING","MISSING PERSON","NON-CRIMINAL","ROBBERY","SECONDARY CODES","SEX OFFENSES FORCIBLE","SEX OFFENSES NON FORCIBLE","STOLEN PROPERTY","SUSPICIOUS OCC"))

dat_occ <- dat_crimeNew %>% group_by(Category,Descript) %>% summarize(occurence= n())

p <- dat_occ %>% spread(Descript,occurence)

final_desc <- c("AGGRAVATED ASSAULT WITH","ASSAULT, AGGRAVATED, W/","ATTEMPTED HOMICIDE WITH","ATTEMPTED MAYHEM WITH","ATTEMPTED SIMPLE","BATTERY WITH","MAYHEM WITH","THREATS AGAINST","WILLFUL CRUELTY","COMMITTING PUBLIC","DISTURBING THE PEAC","MAINTAINING A PUBLIC","FOR SALE","SALE OF","UNDER INFLUENCE","UNDER THE INFLUENCE","ATTEMPTED PETTY THEFT","GRAND THEFT PICK","GRAND THEFT PURSE","PETTY THEFT","THEFT, DRUNK ROLL,","AIDED CASE, DOG","AIDED CASE, INJURED","ASSAULT TO ROB WITH ","ATTEMPTED ROBBERY ON THE STREET","ATTEMPTED ROBBERY WITH","ROBBERY ON THE STREET ","ROBBERY, ARMED WITH A ","ROBBERY, BODILY","ASSAULT BY JUVENILE","SHOOTING BY JUVENILE","ANNOY OR MOLEST","STOLEN CELLULAR PHONE","STOLEN ELECTRONICS","SUSPICIOUS A","SUSPICIOUS OCCU","SUSPICIOUS PER","MISSING")

select_desc <- function(x) { 
                   names(p %>% select(contains(x)))[-1]
      }

dat_crimefinal <- full_join(dat_crimeNew %>% filter(Descript %in% unlist(lapply(final_desc,select_desc))),dat_crimeNew %>% filter(Category %in% c("KIDNAPPING","SEX OFFENSES FORCIBLE","SEX OFFENSES NON FORCIBLE")))
## Joining by: c("Date", "Time", "Category", "Descript", "DayOfWeek", "PdDistrict", "Resolution", "Address", "X", "Y")
# Reduction in observations
dim(dat_crime)
## [1] 878049      9
dim(dat_crimeNew)
## [1] 509681     10
dim(dat_crimefinal)
## [1] 195711     10
# Based on the dat_occ_final we can categorise the level of serverity of crime if we want to show our results in such manner 
dat_occ_final <- dat_crimefinal %>% group_by(Category,Descript) %>% summarize(occurence= n())

cat_life <- dat_crimefinal %>% filter(Category %in% c("ASSAULT","KIDNAPPING","MISSING PERSON","SECONDARY CODES","SEX OFFENSES FORCIBLE","SEX OFFENSES NON FORCIBLE","SUSPICIOUS OCC"))
cat_prop <- dat_crimefinal %>% filter(Category %in% c("STOLEN PROPERTY","ROBBERY","LARCENY/THEFT"))
cat_nui <- dat_crimefinal %>% filter(Category %in% c("DISORDERLY CONDUCT","DRUG/NARCOTIC","DRUNKENNESS","NON-CRIMINAL"))
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.2.5
## Loading required package: ggplot2
x <- dat_crimefinal$X
y <- dat_crimefinal$Y

map_SF <- get_map(location = "San Francisco", zoom = 12)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=San+Francisco&zoom=12&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=San%20Francisco&sensor=false
map <- ggmap(map_SF)


#https://rpubs.com/nickbearman/r-google-map-making
#plot the roads Google Maps basemap
#map <- qmap('San Francisco', zoom = 15, maptype = 'roadmap')
#plot the density map
#map + stat_density2d(aes(x = x, y = y, fill = ..level.., alpha = ..level..), data = dat_crimefinal, geom = "polygon") + scale_fill_gradient(low = "blue", high = "red")

#http://www.r-bloggers.com/contour-and-density-layers-with-ggmap/

#Heat map using all data across all categories
W <- dat_crimefinal
ggmap(map_SF, extent = "panel", maprange=FALSE) +
   geom_density2d(data = W, aes(x = x, y = y)) +
   stat_density2d(data = W, aes(x = x, y = y,  fill = ..level.., alpha = ..level..),
                  size = 0.01, bins = 16, geom = 'polygon') +
   scale_fill_gradient(low = "blue", high = "red") +
   theme(legend.position = "none")
## Warning: Removed 9 rows containing non-finite values (stat_density2d).
## Warning: Removed 9 rows containing non-finite values (stat_density2d).

#Heat map across category deemed as property
x1 <- cat_prop$X
y1 <- cat_prop$Y

ggmap(map_SF, extent = "panel", maprange=FALSE) +
   geom_density2d(data = cat_prop, aes(x = x1, y = y1)) +
   stat_density2d(data = cat_prop, aes(x = x1, y = y1,  fill = ..level.., alpha = ..level..),
                  size = 0.01, bins = 16, geom = 'polygon') +
   scale_fill_gradient(low = "green", high = "black") +
   theme(legend.position = "none")
## Warning: Removed 3 rows containing non-finite values (stat_density2d).
## Warning: Removed 3 rows containing non-finite values (stat_density2d).

#Heat map across category deemed as life
x2 <- cat_life$X
y2 <- cat_life$Y

ggmap(map_SF, extent = "panel", maprange=FALSE) +
   geom_density2d(data = cat_life, aes(x = x2, y = y2)) +
   stat_density2d(data = cat_life, aes(x = x2, y = y2,  fill = ..level.., alpha = ..level..),
                  size = 0.01, bins = 16, geom = 'polygon') +
   scale_fill_gradient(low = "yellow", high = "blue") +
   theme(legend.position = "none")
## Warning: Removed 4 rows containing non-finite values (stat_density2d).
## Warning: Removed 4 rows containing non-finite values (stat_density2d).

#Heat map across category deemed as nuisance
x3 <- cat_nui$X
y3 <- cat_nui$Y

ggmap(map_SF, extent = "panel", maprange=FALSE) +
   geom_density2d(data = cat_nui, aes(x = x3, y = y3)) +
   stat_density2d(data = cat_nui, aes(x = x3, y = y3,  fill = ..level.., alpha = ..level..),
                  size = 0.01, bins = 16, geom = 'polygon') +
   scale_fill_gradient(low = "yellow", high = "red") +
   theme(legend.position = "none")
## Warning: Removed 2 rows containing non-finite values (stat_density2d).
## Warning: Removed 2 rows containing non-finite values (stat_density2d).

#Scatter plots
#https://rpubs.com/hegupta/151080

dat <- dat_crimefinal %>% mutate(class = ifelse(Category %in% c("STOLEN PROPERTY","ROBBERY","LARCENY/THEFT"), "Prop", ifelse(Category %in% c("DISORDERLY CONDUCT","DRUG/NARCOTIC","DRUNKENNESS","NON-CRIMINAL"), "Nui", "Life")))

dat1 <- dat %>% group_by(Category) %>% summarize(occurence = n())
#Histogram by Category
qplot(Category, data = dat, geom = "bar", fill = Category) +
    ggtitle("Crime Categories in San Francisco") +
    xlab("Category") + 
    ylab("Occurence")

qplot(dat$PdDistrict, data = dat, geom = "bar", fill = class) +
    scale_x_discrete(label = abbreviate) +
    ggtitle("Crime by district and class in San Francisco") +
    xlab("District") + 
    ylab("Crimes by class")

#Further analysis of Southern District

#Daily average crime by class in southern district
dat2 <- dat %>% filter(PdDistrict == "SOUTHERN") %>% group_by(DayOfWeek, class) %>% summarize(DailyAvg = mean(n()))

ggplot(dat2, aes(x = dat2$DayOfWeek, y = dat2$DailyAvg, fill = factor(dat2$class) )) + geom_bar(stat="identity") + labs(x = "Day of Week", y = "Daily Average", title = "Daily Avg Crime by class for Southern District")

#Time of Day analysis for Southern District
dat3 <- dat %>% filter(PdDistrict=="SOUTHERN")

time <- as.data.frame(table(dat3$Time),stringsAsFactors = FALSE)
time$Hour <- substr(time$Var1, 1, 2)
TimeOfDay <- ggplot(time, aes(x = Hour, Freq))
TimeOfDay + stat_summary(fun.y = sum, geom = "bar", fill = "grey") +
    labs(x = "Time of Day", y = "Occurrences", title ="Southern District crime timings")